library(foreign)
library(tm)
library(topicmodels)
library(quanteda)
library(stringr)
library(slam)
library(stm)
library(RWeka)
library(arm)

# load opinions.
ops <- VCorpus(DirSource("c:/users/doug/dropbox/topicmodels/SCOTUS/cleanedOutput/"))

# get opinion names.
opsNames <- names(ops)

# drop if not a majority opinion
toKeep <- which(str_extract(opsNames, "_[a-z]+") == "_opinion")
ops <- ops[toKeep]
opsNames <- names(ops)

# create usCite
volumes <- str_extract(opsNames, "out_[0-9]+_")
volumes <- gsub("out_", "", volumes)
volumes <- gsub("_", "", volumes)
pages <- str_extract(opsNames, "_.+_+op")
pages <- gsub("^_[0-9]+_", "", pages)
pages <- gsub("_+op", "", pages)
mergeCite <-  paste(volumes, pages, sep = " U.S. ")


# ------------------------------------------------------------------------------------------------------------------
# need to add justice names from case info if you want to include dissents/concurrences
# ------------------------------------------------------------------------------------------------------------------

# open SCDB.
load("c:/users/doug/dropbox/affirmReverse/data/LegacyDataJustice.RData")

# drop each to just majority opinions
scdb <- subset(scdb, opinion > 1 & vote == 1)
scdb <- scdb[!duplicated(scdb$caseId),]

# match, and find what I'm missing.
volumes <- str_extract(scdb$usCite, "^[0-9]+ U.")
volumes <- gsub(" U.", "", volumes)
scdb$mergeCite <- scdb$usCite
scdb$mergeCite[which(as.numeric(volumes) > 539)] <- paste(volumes[which(as.numeric(volumes) > 539)], scdb$docket[which(as.numeric(volumes) > 539)], sep=" U.S. ")

# drop post-2010
scdb <- subset(scdb, term < 2011)
scdb <- subset(scdb, term > 1802)

# keep only opinions for which we have case info
toKeep <- which(mergeCite %in% scdb$mergeCite)
ops <- ops[toKeep]
opsNames <- names(ops)

# create usCite
volumes <- str_extract(opsNames, "out_[0-9]+_")
volumes <- gsub("out_", "", volumes)
volumes <- gsub("_", "", volumes)
pages <- str_extract(opsNames, "_.+_+op")
pages <- gsub("^_[0-9]+_", "", pages)
pages <- gsub("_+op", "", pages)
mergeCite <-  paste(volumes, pages, sep = " U.S. ")
mergeCite <- unique(mergeCite)

# keep only scdb cases for which we have opinions
scdb <- scdb[!duplicated(scdb$mergeCite),]
toKeep <- which(scdb$mergeCite %in% mergeCite)
scdb <- scdb[toKeep,]

# order them equivalently
ops <- ops[order(mergeCite)]
scdb <- scdb[order(scdb$mergeCite),]
ops <- tm_map(ops, stripWhitespace)

# pre-process
ops <- tm_map(ops, content_transformer(tolower))
ops <- tm_map(ops, removePunctuation, preserve_intra_word_dashes=T)
ops <- tm_map(ops, removeNumbers)
ops <- tm_map(ops, stripWhitespace)

# create dtm with unigrams
dtmUni <- DocumentTermMatrix(ops)

# drop docs with fewer than 25 words
drop <- as.vector(which(row_sums(dtmUni) < 25))
ops <- ops[-drop]
scdb <- scdb[-drop,]
dtmUni <- DocumentTermMatrix(ops)

# only keep top 5,000 words
wc <- col_sums(dtmUni)
cutoff <- as.numeric(rev(sort(wc))[5000])
dtmUni <- dtmUni[,which(wc>(cutoff-1))]

# now create dtms with bigrams
BigramTokenizer <- function(x) NGramTokenizer(x, Weka_control(min=2, max=2))
dtmBi <- DocumentTermMatrix(ops, control=list(tokenize=BigramTokenizer))
# only keep top 2500
wc <- col_sums(dtmBi)
cutoff <- as.numeric(rev(sort(wc))[2500])
dtmBi <- dtmBi[,which(wc>(cutoff-1))]

# combine dtms
dtmBig <- cbind(dtmUni, dtmBi)

# drop stopwords
toDrop <- which(colnames(dtmBig) %in% stopwords("SMART"))
dtmBig <- dtmBig[,-toDrop]

# convert to LDA-C format
myCorpus <- as.matrix(dtmBig)
myCorpus <- readCorpus(myCorpus, type="dtm")

# add counter to SCDB
scdb$time <- scdb$term - 1791

# create STM data
out <- prepDocuments(myCorpus$documents, myCorpus$vocab, scdb)

#-----------------------------------------------------------
# Estimate STM & Plot
#-----------------------------------------------------------
k <- 5
myFit <- stm(out$documents, out$vocab, K=k, prevalence =~ s(time), max.em.its=75, data=out$meta, init.type = "Spectral")


# pull topic names
myLabels <- labelTopics(myFit)
myLabels <- myLabels$frex[,1:3]
myTopics <- apply(myLabels, 1, paste, collapse=", ")

# pull covariate info for plots
prep <- estimateEffect(1:k ~ s(time), myFit, meta=out$meta, uncertainty="Global")

# plot changes in predicted topic proportion over time
years <- sort(unique(out$meta$term))
yearLabs <- c(1850,1900,1950,2000)
ticks <- c(which(years == 1850), which(years==1900), which(years==1950), which(years==2000))
par(mfrow=c(3,5))
par(mar=c(1,1,1,1))
for (i in 1:k){
  plot.estimateEffect(prep, covariate = "time", method = "continuous", topics=i, model=myFit, labeltype="custom",  custom.labels = myTopics[i], ylim=c(0,.2), xaxt="n")
  axis(1, at=ticks, labels=yearLabs)
}

# =-=-=-=-=-=-=-=-=-=-=- #
# Merge Topics to SCDB   #
# =-=-=-=-=-=-=-=-=-=-=- #

topicProps <- myFit$theta
colnames(topicProps) <- myTopics
ourData <- cbind(scdb, topicProps)

# =-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=- #
# comparison of agenda change measures   #
# =-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=- #

# start with scdb
par(mar=c(2,2,2,1))
par(mfrow=c(5,3))
 
# topic proportions by SCDB
scdbTab <- prop.table(table(scdb$term, scdb$issueArea),1)
years <- as.numeric(rownames(scdbTab))
scdbNames <- c("Criminal", "Civil Rights", "First Amdt.", "Due Process", "Privacy",
               "Attorneys", "Unions", "Economic Activity", "Judicial Power",
               "Federalism", "Interstate", "Fed Taxation", "Misc", "Private Law")
for (i in 1:ncol(scdbTab)){
  # compute moving average
  scdbMa <- rollmean(scdbTab[,i], k=7)
  # plot
  plot(years[4:(length(years)-3)], scdbMa, type="l", ylim=c(0,.4), main=scdbNames[i])
}

# now turn to stm proportions (columns 64 to 88)
caseCounts <- as.numeric(table(ourData$term))

par(mfrow=c(5,3))
par(mar=c(2,2,2,1))
for (i in 1:k){
  # do each topic proportion individually
  issueSums <- aggregate(ourData[,63+i], by=list(ourData$term), FUN=sum)[,2]
  stmAttn <- issueSums/caseCounts
  # compute moving average
  scdbMa <- rollmean(stmAttn, k=7)
  # plot
  plot(years[4:(length(years)-3)], scdbMa, type="l", ylim=c(0,.4), main=myTopics[i])
}

# =-=-=-=--=-=-=-=-=-=-=-=-=-=- #
# Big STM with Match to Values  #
# =-=-=-=--=-=-=-=-=-=-=-=-=-=- #

rm(list=ls())
load("c:/users/doug/dropbox/issueMeasures/data/stmOutputs/stmOutput-250topics-12122017.RData")

# top 3 for Roe v. Wade
roe <- subset(ourData, ourData$usCite == "410 U.S. 113")
roeProps <- roe[,64:313]
colnames(roeProps)[order(roeProps)]
# "freedom, people, liberty" "medical, hospital, health" "abortion, constitutionality, unconstitutional" 

# top 3 for Brown v. Board 347 US 483
brown <- subset(ourData, ourData$usCite == "347 U.S. 483")
brownProps <- brown[,64:313]
colnames(brownProps)[order(brownProps)]
# "schools, school, students", "race, racial, women", "fourteenth, the fourteenth, fourteenth amendment"

# top 3 for Plyler v. Doe 457 US 202
doe <- subset(ourData, ourData$usCite == "457 U.S. 202")
doeProps <- doe[,64:313]
colnames(doeProps)[order(doeProps)]
# "alien, aliens, citizenship", "race, racial, women", "at u, us at, s -", "fourteenth, the fourteenth, fourteenth amendment"

# =-=-=-=-=-=-=-=-=-=-=-=-=-=- #
#  Fix topic names
# =-=-=-=-=-=-=-=-=-=-=-=-=-=- #

condTopics <- gsub(", ","_", myTopics)
condTopics <- gsub("-", "", condTopics)
condTopics <- gsub(" ", "", condTopics)
colnames(ourData)[which(colnames(ourData) %in% myTopics)] <- condTopics

# =-=-=-=-=-=-=-=-=-=-=-=-=-=- #
#  Illustrative Cases 		   #
# =-=-=-=-==--=-=-=-=-=-=-=-=- #

myTable <- matrix(NA, length(condTopics),5)
scdbNames <-c("Criminal Procedure", "Civil Rights", "First Amendment", "Due Process",
				"Privacy", "Attorneys", "Unions", "Economic Activity", "Judicial Power",
				"Federalism", "Interstate Relations", "Federal Taxation", "Miscellaneous",
				"Private Action")

for (i in 1:length(condTopics)){
	tmpCase <- tail(ourData$caseName[order(ourData[,which(colnames(ourData)==condTopics[i])])],1)
	tmpIA <- tail(ourData$issueArea[order(ourData[,which(colnames(ourData)==condTopics[i])])],1)
	tmpTerm <- tail(ourData$term[order(ourData[,which(colnames(ourData)==condTopics[i])])],1)
	tmpIssue <- tail(ourData$issue[order(ourData[,which(colnames(ourData)==condTopics[i])])],1)
	
	myTable[i,] <- c(tmpCase, tmpTerm, scdbNames[tmpIA], tmpIssue, condTopics[i])
}


# =-=-=-=--=-=-=-=-=-=-=-=-=-=- #
# Replication 1: Lax & Rader    #
# =-=-=-=--=-=-=-=-=-=-=-=-=-=- #


library("arm")
library("foreign")
library("car")
library("blme")
library("matrixStats")

rm(list=ls())
load("c:/users/doug/dropbox/issueMeasures/data/stmOutputs/stmOutput-50topics-01062018.RData")

# pull topic names
myLabels <- labelTopics(myFit)
myLabels <- myLabels$frex[,1:3]
myTopics <- apply(myLabels, 1, paste, collapse=", ")
topicProps <- myFit$theta
colnames(topicProps) <- myTopics
ourData <- cbind(scdb, topicProps)
condTopics <- gsub(", ","_", myTopics)
condTopics <- gsub("-", "", condTopics)
condTopics <- gsub(" ", "", condTopics)
colnames(ourData)[which(colnames(ourData) %in% myTopics)] <- condTopics

# read in data and do a bunch of transformations they do
load("c:/Users/doug/Dropbox/issueMeasures/replications/Lax & Rader - JOP - 2016/RL CCOA use.RData")
ccoa<-x
ccoa$nyt<-(as.numeric(ccoa$nyt) - 1)
ccoa$laws<-(as.numeric(ccoa$laws) -1)
ccoa$opcount1 <- ccoa$opcount + 1
ccoa$assignor <- ifelse(ccoa$assignor=="Frankfur","Frankfurter", ccoa$assignor)
ccoa <- ccoa[order(ccoa$justice),]
ccoa$maj_coalition <- substr(tapply(ccoa$justice[ccoa$confM_justice==1], ccoa$us[ccoa$confM_justice==1], unique )[ccoa$us], 3,150)
ccoa$maj_coalition <-  gsub(")","",ccoa$maj_coalition)
ccoa$maj_coalition <-  gsub(",","",ccoa$maj_coalition)
ccoa$maj_coalition <-  gsub("\"","",ccoa$maj_coalition)
ccoa <- ccoa[order(ccoa$us),]
#above creates a variable containing list of majoriy coalition members
ccoa$natct_maj_coalition <- paste(ccoa$natct,ccoa$maj_coalition)
ccoa$assignorswitchvote <- ifelse(ccoa$justice==ccoa$assignor & ccoa$switch_rev==1, 1,0)
ccoa$assignorstuckinthiscase <-  1- tapply(ccoa$assignorswitchvote, ccoa$us, max)[ccoa$us]
ccoa$assignorstuckinthiscase <- ifelse(ccoa$assignor=="" |ccoa$assignor=="Stone" | ccoa$us=="353/0087" ,NA,ccoa$assignorstuckinthiscase)

ccoa$natct_maj_coalition_assignee <- paste(ccoa$natct_maj_coalition," assignee",ccoa$assignee, sep="")

# merge with my data
ccoa$volume <- str_extract(ccoa$us, "[0-9]+/")
ccoa$volume <- gsub("/", "", ccoa$volume)
ccoa$page <- str_extract(ccoa$us, "/[0-9]+")
ccoa$page <- gsub("/", "", ccoa$page)
ccoa$page <- gsub("^0+","", ccoa$page)
ccoa$usCite <- paste(ccoa$volume, ccoa$page, sep=" U.S. ")
lrData <- merge(ccoa, ourData, by=c("usCite"))

# regular model (note, no side jump included)
mlm.maj.48 <- glmer(formula = switch_rev ~ IP_signed + log(opcount1) + nyt + laws + IP_noncontig + assignee_justice + IP_marginal 
                    + uncert_justice + freshman  + IP_closer_min + IP_dist_assignee + IP_side:IP_dist_assignee + 
                      (1|conf_total_M) 
                    + (1|value) + (1|justice.x) + (1|natct), subset = confM_justice==1 & conf_total_M < 9, family = binomial(link="probit"), data=lrData)   
display(mlm.maj.48,detail=TRUE) 

# pull ests and ses for nomogram
ests <- summary(mlm.maj.48)$coefficients[,1]
ses <- summary(mlm.maj.48)$coefficients[,2]

# now demonstrate with different measure of legal provisions
props <- lrData[244:(244+k-1)]
props.sq <- props^2
lrData$herf <- -1 * ((rowSums(props.sq)-1/ncol(props.sq))/(1-(1/ncol(props.sq))))

## replication with herfindahl instead of legal provisions
mlm.herf <- glmer(formula = switch_rev ~ IP_signed + log(opcount1) + nyt + herf + IP_noncontig + assignee_justice + IP_marginal 
                    + uncert_justice + freshman  + IP_closer_min + IP_dist_assignee + IP_side:IP_dist_assignee + 
                      (1|conf_total_M) 
                    + (1|value) + (1|justice.x) + (1|natct), subset = confM_justice==1 & conf_total_M < 9, family = binomial(link="probit"), data=lrData)   
display(mlm.herf,detail=TRUE) 

# pull ests and ses for nomogram
estsHerf <- summary(mlm.herf)$coefficients[,1]
sesHerf <- summary(mlm.herf)$coefficients[,2]

# 15 not significant; 25 not significant; 50 not significant
plotIndex <- c(1:length(ests))
plotIndexPlus <- plotIndex+.17
par(mar=c(4,6,1,1))
par(mfrow=c(1,1))
plot(ests, plotIndex, pch=19, yaxt="n", ylab="", xlim=c(-1,1.2), xlab="", xaxt="n", ylim=c(0.5,length(ests)+.5))
segments(ests+1.96*ses,plotIndex,ests-1.96*ses,plotIndex)
par(new=T)
plot(estsHerf, plotIndexPlus, pch=17,col="grey41", yaxt="n", ylab="", xlim=c(-1,1.2), xlab="Coefficient Estimate", ylim=c(0.5,length(ests)+.5))
segments(estsHerf+1.96*sesHerf,plotIndexPlus,estsHerf-1.96*sesHerf,plotIndexPlus, col="grey41")
abline(v=0, lty=2)
axis(2, at=c(plotIndex), labels=names(ests), las=2)

# predicted probabilities plots
library(sjPlot)
library(ggplot2)
library(sjmisc)
set_theme(base=theme_bw())
plot_model(mlm.maj.48, terms=c("laws"), axis.lim=c(0,.125),type=c("eff"), title="", axis.title = c("More than One Legal Provision", "Probability"), colors=theme_bw())
plot_model(mlm.herf, terms=c("herf"), axis.lim=c(0,.125),type=c("eff"), title="", axis.title = c("Topic Dispersion", "Probability"), colors=theme_bw())

#=-=-=-=-=--=-=-=-
#  Baird Rep
#=-=-=-=-=-=-=-=-=

rm(list=ls())

# load stm output
topicChoices <- c(5,10,15,20,25,50)
for (j in 1:length(topicChoices)){

	# load data
	dataCall <- paste("load(\"c:/users/doug/dropbox/issueMeasures/data/stmOutputs/stmOutput-",topicChoices[j],"topics-01062018.RData\")", sep="")
	eval(parse(text=dataCall))
	
	# pull topic names
	myLabels <- labelTopics(myFit)
	myLabels <- myLabels$frex[,1:3]
	myTopics <- apply(myLabels, 1, paste, collapse=", ")

	# merge to scdb
	topicProps <- myFit$theta
	colnames(topicProps) <- myTopics
	ourData <- cbind(scdb, topicProps)

	# create condensed topic names
	condTopics <- gsub(", ","_", myTopics)
	condTopics <- gsub("-", "", condTopics)
	condTopics <- gsub(" ", "", condTopics)
	colnames(ourData)[which(colnames(ourData) %in% myTopics)] <- condTopics

	# add salience data
	load("c:/users/doug/dropbox/issueMeasures/data/scdbSalience.rdata")
	salData <- merge(ourData, scdbSalience, by=c("caseId"))

	# create policy-year dataset
	for (i in 1:k){
		tmp <- aggregate(salData[,which(colnames(salData) == condTopics[i])], by=list(salData$term), FUN=sum, na.rm=T)
		tmp$topic <- c(condTopics[i])
		colnames(tmp) <- c("term", "allCases", "topic")
		if (i == 1){
			pyOut <- tmp 
		}
		if (i > 1){
			pyOut <- rbind(pyOut, tmp)
		}
	}
	# create lag
	ltmp <- pyOut
	ltmp$term <- ltmp$term + 1
	colnames(ltmp) <- c("term", "l.allCases", "topic")
	pyOut <- merge(pyOut, ltmp, by=c("term", "topic"))


	# create salient * attention
	for (i in 1:k){
		sals <- salData[,which(colnames(salData) == condTopics[i])] * salData$nytSalience
		tmp <- aggregate(sals, by=list(salData$term), FUN=sum, na.rm=T)
		tmp$topic <- c(condTopics[i])
		colnames(tmp) <- c("term", "salientCases", "topic")
		if (i == 1){
			salOut <- tmp 
		}
		if (i > 1){
			salOut <- rbind(salOut, tmp)
		}
	}	

	# create lags
	ltmp <- salOut
	tmpBase <- ltmp
	ltmp$term <- ltmp$term + 1
	colnames(ltmp) <- c("term", "l.salientCases", "topic")
	salOut <- merge(salOut, ltmp, by=c("term", "topic"))
	ltmp <- tmpBase
	ltmp$term <- ltmp$term + 2
	colnames(ltmp) <- c("term", "l2.salientCases", "topic")
	salOut <- merge(salOut, ltmp, by=c("term", "topic"))
	ltmp <- tmpBase
	ltmp$term <- ltmp$term + 3
	colnames(ltmp) <- c("term", "l3.salientCases", "topic")
	salOut <- merge(salOut, ltmp, by=c("term", "topic"))
	ltmp <- tmpBase
	ltmp$term <- ltmp$term + 4
	colnames(ltmp) <- c("term", "l4.salientCases", "topic")
	salOut <- merge(salOut, ltmp, by=c("term", "topic"))
	ltmp <- tmpBase
	ltmp$term <- ltmp$term + 5
	colnames(ltmp) <- c("term", "l5.salientCases", "topic")
	salOut <- merge(salOut, ltmp, by=c("term", "topic"))
	ltmp <- tmpBase
	ltmp$term <- ltmp$term + 6
	colnames(ltmp) <- c("term", "l6.salientCases", "topic")
	salOut <- merge(salOut, ltmp, by=c("term", "topic"))

	# create dataset
	bairdData <- merge(pyOut, salOut, by=c("term", "topic"))

	# estimate model w/ lagged DV
	lmBairdFe <- lm(allCases ~ l.allCases + l.salientCases + l2.salientCases + l3.salientCases + l4.salientCases + l5.salientCases + l6.salientCases + factor(topic) + factor(term), data=bairdData)

	# pull estimates for plot
	ests <- coef(lmBairdFe)[3:8]
	lower <- coef(lmBairdFe)[3:8] - 1.96 * se.coef(lmBairdFe)[3:8]
	upper <- coef(lmBairdFe)[3:8] + 1.96 * se.coef(lmBairdFe)[3:8]

	# create ropeladder plot
	pdfCall <- paste("pdf(\"c:/users/doug/dropbox/issueMeasures/plots/ropeLadder-",topicChoices[j],".pdf\", width=4, height=4)", sep="")
	eval(parse(text=pdfCall))
	par(mar=c(1,4,1,1))
	plot(1:6, ests, pch=19, main="", xlab="", ylab="", bty="n", cex=1.5, ylim=c(-.8,.8), xaxt="n")
	abline(h=0,lty=2, lwd=1.5, col="gray60")
	segments(1:6, lower, 1:6, upper, lwd=2, col="gray50")
	points(1:6, ests, cex=1.30, col="black", pch=19)
	dev.off()

	# clear everythiing and do the next model
	rm(list=ls())
	gc()
	topicChoices <- c(5,10,15,20,25,50)
}
	
## =-=-=-=-=-=-=-=-=-=-=-=-=-=- ## 
##  Plot of Correlated Topics   ##
## =-=-=-=-=-=-=-=-=-=-=-=-=-=- ##

load("c:/users/doug/dropbox/issueMeasures/data/stmOutputs/stmOutput-15topics-01062018.RData")

# pull topic names
myLabels <- labelTopics(myFit)
myLabels <- myLabels$frex[,1:3]
myTopics <- apply(myLabels, 1, paste, collapse=", ")

# merge to scdb
topicProps <- myFit$theta
colnames(topicProps) <- myTopics
corrplot(cor(topicProps), method="color", order="hclust", addrect=4, tl.col="black")









